Import de packages utiles

library(FactoMineR)
library(factoextra)
library(CASdatasets)
library(tidyverse)
library(MASS)
library(knitr)
library(ggplot2)
library(cowplot)
library(reshape2)
library(dplyr)
library(GGally)
library(corrplot)
library(carData) 
library(car)
library(questionr)
library(multcomp)
library(dplyr)
library(leaps)
library(TeachingDemos)
library(FactoMineR)
library(factoextra)
library(ROCR)
library(plotROC)

Documentation et import du dataset

# ?CASdatasets  # uncomment to open the package help describing the dataset
# Load the freMPL5 dataset into the workspace and print a per-column summary.
data("freMPL5")
summary(object = freMPL5)
##     Exposure         LicAge      RecordBeg            RecordEnd         
##  Min.   :0.001   Min.   : 24   Min.   :2004-01-01   Min.   :2004-01-02  
##  1st Qu.:0.170   1st Qu.:229   1st Qu.:2004-01-01   1st Qu.:2004-05-01  
##  Median :0.403   Median :355   Median :2004-04-15   Median :2004-07-18  
##  Mean   :0.423   Mean   :350   Mean   :2004-05-07   Mean   :2004-07-20  
##  3rd Qu.:0.666   3rd Qu.:463   3rd Qu.:2004-08-11   3rd Qu.:2004-10-14  
##  Max.   :1.000   Max.   :844   Max.   :2004-12-31   Max.   :2004-12-31  
##                                                     NA's   :12818       
##     Gender       MariStat       SocioCateg                      VehUsage    
##  Female: 8856   Alone: 3461   CSP50  :12385   Private               : 8576  
##  Male  :17144   Other:22539   CSP60  : 5646   Private+trip to office:12163  
##                               CSP55  : 3247   Professional          : 4299  
##                               CSP1   :  960   Professional run      :  962  
##                               CSP66  :  672                                 
##                               CSP42  :  655                                 
##                               (Other): 2435                                 
##     DrivAge        HasKmLimit       ClaimAmount       ClaimNbResp    
##  Min.   :20.00   Min.   :0.00000   Min.   :-1842.0   Min.   :0.0000  
##  1st Qu.:39.00   1st Qu.:0.00000   1st Qu.:    0.0   1st Qu.:0.0000  
##  Median :51.00   Median :0.00000   Median :    0.0   Median :0.0000  
##  Mean   :50.45   Mean   :0.07462   Mean   :  204.9   Mean   :0.2663  
##  3rd Qu.:61.00   3rd Qu.:0.00000   3rd Qu.:    0.0   3rd Qu.:0.0000  
##  Max.   :95.00   Max.   :1.00000   Max.   :95151.0   Max.   :4.0000  
##                                                                      
##  ClaimNbNonResp   ClaimNbParking    ClaimNbFireTheft  ClaimNbWindscreen
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.000    
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000    
##  Median :0.0000   Median :0.00000   Median :0.00000   Median :0.000    
##  Mean   :0.3218   Mean   :0.08265   Mean   :0.07638   Mean   :0.418    
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:1.000    
##  Max.   :7.0000   Max.   :4.00000   Max.   :3.00000   Max.   :8.000    
##                                                                        
##     OutUseNb         RiskArea        BonusMalus        ClaimInd    
##  Min.   :0.0000   Min.   : 1.000   Min.   : 50.00   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.: 6.000   1st Qu.: 50.00   1st Qu.:0.000  
##  Median :0.0000   Median : 7.000   Median : 50.00   Median :0.000  
##  Mean   :0.2243   Mean   : 7.845   Mean   : 57.78   Mean   :0.093  
##  3rd Qu.:0.0000   3rd Qu.:10.000   3rd Qu.: 60.00   3rd Qu.:0.000  
##  Max.   :5.0000   Max.   :13.000   Max.   :185.00   Max.   :1.000  
## 
# Head-count of every level of the socio-professional category factor.
summary(freMPL5[["SocioCateg"]])
##  CSP1 CSP17  CSP2 CSP20 CSP21 CSP22 CSP26  CSP3 CSP30 CSP37 CSP38 CSP40 CSP41 
##   960     2    78    93   171   127   157    29     1   219     2    74     2 
## CSP42 CSP44 CSP45 CSP46 CSP47 CSP48 CSP49 CSP50 CSP51 CSP55 CSP56 CSP57 CSP59 
##   655     1     2   651    44   494    78 12385     6  3247     2     9     2 
##  CSP6 CSP60 CSP61 CSP63 CSP65 CSP66  CSP7 
##   171  5646     1     1    14   672     4

On remarque que certaines variables sont numériques au lieu d’être considérées comme des facteurs. Nous allons donc les changer :

# HasKmLimit and ClaimInd only take the values 0/1 in the summary above, so
# they are recoded as factors rather than left numeric.
freMPL5[c("HasKmLimit", "ClaimInd")] <- lapply(
  freMPL5[c("HasKmLimit", "ClaimInd")],
  factor
)
# OutUseNb stays quantitative; it is only coerced to a plain numeric vector.
freMPL5[["OutUseNb"]] <- as.numeric(freMPL5[["OutUseNb"]])

Quelques nouveaux plots de corrélation

# Pairwise 2-D kernel density estimates (MASS::kde2d) for the quantitative
# variables, one filled contour plot per pair.
# combn() enumerates the pairs in the order of `kde_vars`, i.e. DrivAge against
# every later variable first, then BonusMalus, and so on (21 plots in total).
kde_vars <- c(
  "DrivAge", "BonusMalus", "LicAge", "Exposure",
  "ClaimNbNonResp", "ClaimNbWindscreen", "RiskArea"
)

for (kde_pair in combn(kde_vars, 2, simplify = FALSE)) {
  x_name <- kde_pair[1]
  y_name <- kde_pair[2]

  # `A` is kept as the name of the density estimate; after the loop it holds
  # the estimate of the last pair (ClaimNbWindscreen, RiskArea).
  A <- kde2d(freMPL5[[x_name]], freMPL5[[y_name]])

  # Label every plot: without a title and axis names the contour plots cannot
  # be told apart.
  filled.contour(
    A,
    plot.title = title(
      main = paste(x_name, "vs", y_name),
      xlab = x_name,
      ylab = y_name
    )
  )
}

On remarque donc uniquement une réelle forte corrélation entre LicAge et DrivAge pour les variables continues.

ACP

# freMPL stands for "French Motor Personal Line" datasets.
# Dataset 5 is used here; it holds about 26000 contracts from the year 2004.

# Quantitative variables kept for the correlation plot and the PCA. They are
# selected by name instead of by position (previously columns 1, 2, 9, 11:19)
# so the selection does not silently shift if the column layout changes.
# RiskArea is treated as quantitative because it is "possibly ordered".
quanti_vars <- c(
  "Exposure", "LicAge", "DrivAge", "ClaimAmount",
  "ClaimNbResp", "ClaimNbNonResp", "ClaimNbParking",
  "ClaimNbFireTheft", "ClaimNbWindscreen", "OutUseNb",
  "RiskArea", "BonusMalus"
)
x <- freMPL5[, quanti_vars]

# cor() needs numeric columns; fail early with a clear message if this chunk
# is re-run after these columns have been recoded as factors.
stopifnot(vapply(x, is.numeric, logical(1)))

corrplot(round(cor(x), 2), method = "ellipse")

# PCA on the quantitative variables; the result is printed, not stored.
PCA(x)

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 26000 individuals, described by 12 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

ACM

On va ici modifier les données pour faire une ACM.

# Reload the raw dataset and drop the rows whose claim amount is negative:
# they correspond to regularisations and bring nothing to the analysis.
data("freMPL5")
freMPL5 <- subset(freMPL5, ClaimAmount >= 0)

# Indicator, area and claim-count columns are recoded as factors.
to_factor <- c(
  "HasKmLimit", "RiskArea", "ClaimInd", "ClaimNbFireTheft",
  "ClaimNbResp", "ClaimNbNonResp", "ClaimNbParking",
  "ClaimNbWindscreen", "OutUseNb"
)
freMPL5[to_factor] <- lapply(freMPL5[to_factor], factor)

# The remaining quantitative variables are binned into classes so that every
# variable entering the MCA is qualitative. Exposure, DrivAge and LicAge use
# quantile breaks (4, 6 and 6 classes) to get classes of similar size.
freMPL5$Exposure <- cut(
  freMPL5$Exposure,
  breaks = quantile(freMPL5$Exposure, probs = seq(0, 1, 1 / 4)),
  include.lowest = TRUE
)
freMPL5$DrivAge <- cut(
  freMPL5$DrivAge,
  breaks = quantile(freMPL5$DrivAge, probs = seq(0, 1, 1 / 6)),
  include.lowest = TRUE
)
freMPL5$LicAge <- cut(
  freMPL5$LicAge,
  breaks = quantile(freMPL5$LicAge, probs = seq(0, 1, 1 / 6)),
  include.lowest = TRUE
)
# BonusMalus and ClaimAmount use fixed breaks instead of quantiles.
freMPL5$BonusMalus <- cut(
  freMPL5$BonusMalus,
  breaks = c(50, 54, seq(60, 200, 20)),
  include.lowest = TRUE
)
# The binned claim amount goes into a new column; ClaimAmount is kept as is.
freMPL5$IntervalCout <- cut(
  freMPL5$ClaimAmount,
  breaks = c(0, seq(1, 100000, 1000)),
  include.lowest = TRUE
)

Voici la réalisation de l’ACM :

# Keep only the factor columns; this is the table handed to the MCA.
fact <- select_if(freMPL5, is.factor)

# MCA of the data keeping 5 dimensions; the built-in graphs use axes 1 and 2.
res.mca <- MCA(fact, ncp = 5, axes = c(1, 2), graph = TRUE)

## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Print the index of the result components available in the MCA object.
print(x = res.mca)
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 25722 individuals, described by 18 variables
## *The results are available in the following objects:
## 
##    name              description                       
## 1  "$eig"            "eigenvalues"                     
## 2  "$var"            "results for the variables"       
## 3  "$var$coord"      "coord. of the categories"        
## 4  "$var$cos2"       "cos2 for the categories"         
## 5  "$var$contrib"    "contributions of the categories" 
## 6  "$var$v.test"     "v-test for the categories"       
## 7  "$ind"            "results for the individuals"     
## 8  "$ind$coord"      "coord. for the individuals"      
## 9  "$ind$cos2"       "cos2 for the individuals"        
## 10 "$ind$contrib"    "contributions of the individuals"
## 11 "$call"           "intermediate results"            
## 12 "$call$marge.col" "weights of columns"              
## 13 "$call$marge.li"  "weights of rows"
# Plot the MCA variable categories, coloured by their cos2 along a
# three-colour gradient; labels are repelled to limit overlaps.
fviz_mca_var(
  res.mca,
  repel = TRUE,
  col.var = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  ggtheme = theme_minimal()
)

Les variables à faible valeur de cos2 seront en bleu-vert (#00AFBB), les variables à valeur moyenne en jaune (#E7B800) et les variables à valeur forte en rouge orangé (#FC4E07).

# MCA of the variables viewed on the 1st and 3rd axes.
res.mca_2 <- MCA(fact, ncp = 5, graph = TRUE, axes = c(1, 3))

## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# `axes` must be passed explicitly: fviz_mca_var() defaults to axes = c(1, 2),
# so without it this plot would show the 1-2 plane again, not the 1-3 plane.
fviz_mca_var(
  res.mca_2,
  axes = c(1, 3),
  col.var = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE,
  ggtheme = theme_minimal()
)

# MCA of the variables viewed on the 2nd and 3rd axes.
res.mca_3 <- MCA(fact, ncp = 5, graph = TRUE, axes = c(2, 3))

## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# `axes` must be passed explicitly: fviz_mca_var() defaults to axes = c(1, 2),
# so without it this plot would show the 1-2 plane again, not the 2-3 plane.
fviz_mca_var(
  res.mca_3,
  axes = c(2, 3),
  col.var = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE,
  ggtheme = theme_minimal()
)

# Biplot of the first MCA (res.mca). Both selections use contrib = 15, which
# per the factoextra documentation keeps the 15 highest-contributing
# individuals and the 15 highest-contributing variable categories.
fviz_mca_biplot(
  res.mca,
  select.var = list(contrib = 15),
  select.ind = list(contrib = 15)
)

# Base FactoMineR plots of the first MCA; `invisible` lists the element types
# that are hidden on each graph.

# Variable categories hidden.
plot(
  res.mca,
  invisible = c("var", "quali.sup", "quanti.sup"),
  cex = 0.7
)

# Individuals hidden.
plot(
  res.mca,
  invisible = c("ind", "quali.sup", "quanti.sup"),
  cex = 0.8
)

# Only the supplementary elements hidden.
plot(
  res.mca,
  invisible = c("quali.sup", "quanti.sup"),
  cex = 0.8
)

# Confidence ellipses for the variables MariStat, SocioCateg, ClaimInd and
# RiskArea. The variables are looked up by name in `fact` (the table given to
# MCA) instead of using hard-coded positions (previously 4, 5, 17, 15), so the
# selection stays correct if the set or order of factor columns changes.
ellipse_vars <- c("MariStat", "SocioCateg", "ClaimInd", "RiskArea")
ellipse_idx <- match(ellipse_vars, colnames(fact))
# Stop with a clear error if one of the variables is not a column of `fact`.
stopifnot(!anyNA(ellipse_idx))
plotellipses(res.mca, keepvar = ellipse_idx)

# Contribution of the variable categories to each of the first three axes;
# each bar chart is limited to the 30 largest contributions.
fviz_contrib(
  X = res.mca,
  choice = "var",
  axes = 1,
  top = 30
)

fviz_contrib(
  X = res.mca,
  choice = "var",
  axes = 2,
  top = 30
)

fviz_contrib(
  X = res.mca,
  choice = "var",
  axes = 3,
  top = 30
)

# Individuals of the first MCA, drawn once per variable: points are coloured
# by the levels of that variable (habillage) and a 95% ellipse is added for
# each level.
# A loop over the columns of `fact` (the table given to MCA) replaces one
# hand-written call per column index, and the variable name is shown as the
# plot subtitle so each graph can be identified.
# Note: for levels with very few individuals fviz_mca_ind() reports
# "Too few points to calculate an ellipse" and draws no ellipse for them.
habillage_vars <- colnames(fact)

for (habillage_idx in seq_along(habillage_vars)) {
  ind_plot <- fviz_mca_ind(
    res.mca,
    label = "none",
    habillage = habillage_idx,
    addEllipses = TRUE,
    ellipse.level = 0.95
  )
  # Inside a loop the ggplot object is not auto-printed; print it explicitly.
  print(ind_plot + labs(subtitle = habillage_vars[habillage_idx]))
}

# Individuals plot coloured by variable number 5, with 95% ellipses per level.
# NOTE(review): this repeats the habillage = 5 plot drawn earlier; column 5 of
# the MCA table is presumably SocioCateg -- confirm before relying on it.
fviz_mca_ind(
  res.mca,
  habillage = 5,
  label = "none",
  ellipse.level = 0.95,
  addEllipses = TRUE
)
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse